Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
debakarr
GitHub Repository: debakarr/machinelearning
Path: blob/master/Part 1 - Data Preprocessing/[Python] Data Preprocessing.ipynb
1002 views
Kernel: Python 3

Data Preprocessing

Importing the libraries

import numpy as np import matplotlib.pyplot as plt import pandas as pd from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder, StandardScaler from sklearn.model_selection import train_test_split %matplotlib inline

Importing the dataset

# Load the raw data set; expects Data.csv in the working directory.
# From the outputs below: columns are Country, Age, Salary, Purchased.
dataset = pd.read_csv('Data.csv')
dataset
# All columns except the last form the feature matrix (object dtype,
# since Country is a string and Age/Salary are floats with NaNs).
X = dataset.iloc[:, :-1].values # matrix of features/independent variables
X
array([['France', 44.0, 72000.0], ['Spain', 27.0, 48000.0], ['Germany', 30.0, 54000.0], ['Spain', 38.0, 61000.0], ['Germany', 40.0, nan], ['France', 35.0, 58000.0], ['Spain', nan, 52000.0], ['France', 48.0, 79000.0], ['Germany', 50.0, 83000.0], ['France', 37.0, 67000.0]], dtype=object)
# Dependent variable: the last column ('Purchased').  Indexing with -1
# instead of the hard-coded position 3 keeps this correct if feature
# columns are ever added to the CSV; for this 4-column dataset the two
# are identical.
Y = dataset.iloc[:, -1].values # dependent variables
Y
array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'], dtype=object)

Taking care of missing data

# Replace missing numeric values (Age, Salary) with the column mean.
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer is the supported replacement.  It has no `axis` argument:
# it always imputes column-wise, which matches the old axis=0 behaviour.
# missing_values is the actual np.nan value, not the string 'NaN'.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
X
array([['France', 44.0, 72000.0], ['Spain', 27.0, 48000.0], ['Germany', 30.0, 54000.0], ['Spain', 38.0, 61000.0], ['Germany', 40.0, 63777.77777777778], ['France', 35.0, 58000.0], ['Spain', 38.77777777777778, 52000.0], ['France', 48.0, 79000.0], ['Germany', 50.0, 83000.0], ['France', 37.0, 67000.0]], dtype=object)

Encoding categorical data

# Turn the Country strings into integer codes (from the output below:
# France=0, Germany=1, Spain=2, i.e. alphabetical order) so the column
# can be one-hot encoded in the next cell.
# NOTE(review): LabelEncoder is documented for target labels, not
# features; modern OneHotEncoder accepts string columns directly, which
# would make this step unnecessary -- confirm before reusing elsewhere.
labelencoder_X = LabelEncoder() X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
X
array([[0, 44.0, 72000.0], [2, 27.0, 48000.0], [1, 30.0, 54000.0], [2, 38.0, 61000.0], [1, 40.0, 63777.77777777778], [0, 35.0, 58000.0], [2, 38.77777777777778, 52000.0], [0, 48.0, 79000.0], [1, 50.0, 83000.0], [0, 37.0, 67000.0]], dtype=object)
# One-hot encode the Country column (index 0).
# OneHotEncoder's `categorical_features` argument was removed in
# scikit-learn 0.22; ColumnTransformer is the supported way to encode
# selected columns.  remainder='passthrough' appends the untouched
# columns (Age, Salary) after the encoded ones, reproducing the old
# column order: France, Germany, Spain dummies first, then Age, Salary.
from sklearn.compose import ColumnTransformer

oneHotEncoder = ColumnTransformer(
    transformers=[('country', OneHotEncoder(), [0])],
    remainder='passthrough',
)
X = oneHotEncoder.fit_transform(X)
# Depending on the scikit-learn version the result may be sparse;
# densify to match the original .toarray() output.
if hasattr(X, 'toarray'):
    X = X.toarray()
X # 1st column is replaced by 3 columns # they represent France, Germany and Spain respectively
array([[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01, 7.20000000e+04], [ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01, 4.80000000e+04], [ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01, 5.40000000e+04], [ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01, 6.10000000e+04], [ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01, 6.37777778e+04], [ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01, 5.80000000e+04], [ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01, 5.20000000e+04], [ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01, 7.90000000e+04], [ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01, 8.30000000e+04], [ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01, 6.70000000e+04]])
# Encode the dependent variable: 'No'/'Yes' become 0/1 (alphabetical
# order, as the output below confirms).
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
Y
array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

Splitting the dataset into the Training set and Test set

# Hold out 20% of the rows for testing (10 rows -> 8 train / 2 test).
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
# Inspect the split: with 10 rows and test_size=0.2 we get 8 training
# rows and 2 test rows; the X and Y splits stay aligned row-for-row.
X_train
array([[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01, 5.80000000e+04], [ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01, 7.20000000e+04], [ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01, 7.90000000e+04], [ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01, 5.40000000e+04], [ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01, 6.70000000e+04], [ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01, 6.37777778e+04], [ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01, 6.10000000e+04], [ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01, 5.20000000e+04]])
len(X_train)
8
X_test
array([[ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01, 8.30000000e+04], [ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01, 4.80000000e+04]])
len(X_test)
2
Y_train
array([1, 0, 1, 0, 1, 1, 0, 0])
len(Y_train)
8
Y_test
array([0, 1])
len(Y_test)
2

Feature Scaling

# Standardize all columns to zero mean / unit variance.  The scaler is
# fitted on the training set only and the same parameters are applied
# to the test set, so no test-set information leaks into the fit.
scale_X = StandardScaler().fit(X_train)
X_train = scale_X.transform(X_train)
X_test = scale_X.transform(X_test)
# Inspect the scaled matrices; the dummy columns are scaled too, which
# is harmless here but worth noting.
X_train
array([[ 1. , -0.57735027, -0.57735027, -0.7529426 , -0.62603778], [ 1. , -0.57735027, -0.57735027, 1.00845381, 1.01304295], [ 1. , -0.57735027, -0.57735027, 1.79129666, 1.83258331], [-1. , 1.73205081, -0.57735027, -1.73149616, -1.09434656], [ 1. , -0.57735027, -0.57735027, -0.36152118, 0.42765698], [-1. , 1.73205081, -0.57735027, 0.22561096, 0.05040824], [-1. , -0.57735027, 1.73205081, -0.16581046, -0.27480619], [-1. , -0.57735027, 1.73205081, -0.01359102, -1.32850095]])
X_test
array([[-1. , 1.73205081, -0.57735027, 2.18271808, 2.30089209], [-1. , -0.57735027, 1.73205081, -2.3186283 , -1.79680973]])